rm(list=ls())
library(readstata13)
library(modelsummary)
library(data.table)
library(splitstackshape)
library(dplyr)
library(fastDummies)
library(rdrobust)
library(lubridate) 


# working directory
  setwd("")

# input stata dataset and save a copy
  raw <- read.dta13("")
  base<-as.data.table(raw)

###########################################################################################
# General preparations
###########################################################################################

# sort ids by year of interview
  datasummary(pid + hhp_id ~ N, data=base)
  base<-base[order(hhp_id,pid,entryd)]

# creating a variable with year of survey and months of survey
    fnames<- grep("entry", names(base), value=T)
    for (i in fnames) {
      temp <- tstrsplit(base[[i]], "-")
      set(base, j = sprintf("%s_%d", i, seq_along(temp)), value = temp)
    }
  #removing day variables
    base<-base[,c("entryd_3"  ,    "entryd_fup1_3",
                  "entryd_fup2_3", "entryd_fup3_3", "entryd_fup4_3" ,"entryd_fup5_3"):=NULL] 
    
    rm(fnames,temp,i,raw)
  
  #renaming year and month variables; saving as numeric
    base<-setnames(base,old=c("entryd_1", "entryd_2" ,   
                              "entryd_fup1_1" ,  "entryd_fup1_2",     
                              "entryd_fup2_1" ,  "entryd_fup2_2", 
                              "entryd_fup3_1" ,  "entryd_fup3_2", 
                              "entryd_fup4_1" ,  "entryd_fup4_2",      
                              "entryd_fup5_1" ,  "entryd_fup5_2" ), new=c("year_0", "month_0",
                                                                          "year_1", "month_1",
                                                                          "year_2", "month_2",
                                                                          "year_3", "month_3",
                                                                          "year_4", "month_4",
                                                                          "year_5", "month_5"))
    numer <- c("year_0", "month_0",
               "year_1", "month_1",
               "year_2", "month_2",
               "year_3", "month_3",
               "year_4", "month_4",
               "year_5", "month_5")
    
    
    base[, numer] <- base[, lapply(.SD, as.numeric), .SDcols = numer]
    rm(numer)

# creating a variable to indicate the if the respondent is present in the wave consecutive to a blood pressure reading
  base<-base[,w02:=ifelse(is.na(year_0)=="FALSE" & is.na(year_2)=="FALSE",1,0)] # baseline and follow-up 2
  base<-base[,w24:=ifelse(is.na(year_2)=="FALSE" & is.na(year_4)=="FALSE",1,0)] # follow-up 2 and follow-up 4

# female variable
  base<-base[,female:=ifelse(pd_sex=="Female",1,ifelse(pd_sex=="Male",0,NA))]
  
# religion variable  
  base<-base[, religion:= ifelse(pd_relig=="Hindu",0,ifelse(pd_relig=="Muslim",1,2))]
  base$religion <- factor(base$religion,labels =c("Hindu","Muslim","Others"))
  
###########################################################################################
# Cleaning data by waves - for all these waves, we keep only paired observation
###########################################################################################
#Creating an attrition tracking matrix
  samplesize<-matrix(nrow=4,ncol=3)
  samplesize[1,1] <-"Baseline"
  samplesize[2,1] <-"Age eligible"
  samplesize[3,1] <-"Paired and non-missing sex"
  samplesize[4,1] <-"Non-missing blood pressure readings (both waves)"
  colnames(samplesize) <- c("Observations", "PairA", "PairB")

# Step 1 - Raw count of observations
  samplesize[1,2] <-nrow(base[is.na(year_0)=="FALSE",])
  samplesize[1,3] <-nrow(base[is.na(year_2)=="FALSE",])


################################################################################
# Cleaning Pair 02
################################################################################
# Step 2 - Keep people 30 years and older at pair baseline
  base <- base[, age_calc:= trunc((pd_dob %--% entryd) / years(1))]
  samplesize[2,2] <-nrow(base[is.na(year_0)=="FALSE" & age_calc>=30,])
  datasummary(w02 + w24 ~ N + mean, data=base)

# Step 3 - Keep people who have paired observations, aged 30+ with non-missing sex
  w02 <-base[w02==1,]
  w02 <- w02[age_calc>=30,] 
  w02 <-na.omit(w02, cols=(c("female")))  
  samplesize[3,2] <- nrow(w02)

# Step 4.1 - Drop respondents with missing BP measurements at pair baseline
  w02 <- w02[is.na(systolic_bp_second)=="FALSE" & is.na(systolic_bp_first)=="FALSE",]
  w02 <- w02[is.na(diastolic_bp_second)=="FALSE" & is.na(diastolic_bp_first)=="FALSE",]

# Step 4.2 - Drop respondents with missing BP measurements at pair follow-up
  w02 <- w02[is.na(sbp_fu2_2)=="FALSE" & is.na(sbp_fu2_1)=="FALSE",]
  w02 <- w02[is.na(dbp_fu2_2)=="FALSE" & is.na(dbp_fu2_1)=="FALSE",]
  samplesize[4,2] <- nrow(w02)

# age categories (<40 and 40+) 
  w02 <- w02[, age_calc:= trunc((pd_dob %--% entryd) / years(1))]
  summary(w02$age_calc)
  w02 <- w02[, agecat:= ifelse(age_calc<40,0, ifelse(age_calc>=40,1,NA))]

# level of education 
  # The labels are wrong. Primary school is actually "uncompleted primary school", 
  # High school refers to "secondary school completed". Secondary School / intermediary refers to High School. 
  # This can be verified by looking at the years of schooling in each of the categories.
  w02 <- w02[, edu:= ifelse(pd_edu_stat=="Illiterate" | pd_edu_stat=="Literate" | pd_edu_stat=="Primary school", 0,
                          ifelse(pd_edu_stat == "High school",1,
                                 ifelse(pd_edu_stat=="Secondary school/ Intermediary" | pd_edu_stat=="Graduate" | pd_edu_stat=="Professional degree/post graduate", 2, NA)))]
  table(w02$edu)

# prior diagnosis of cardiometabolic disease
  w02 <- w02[,diabetesknow:=ifelse(pd_diabetes=="Yes",1,ifelse(pd_diabetes=="No",0,NA))] # individual reports diabetes diagnosis at baseline
  w02 <- w02[,heartknow:=ifelse(pd_heart=="Yes",1,ifelse(pd_heart=="No",0,NA))] # individual reports heart attack at baseline
  w02 <- w02[,strokeknow:=ifelse(pd_stroke=="Yes",1,ifelse(pd_stroke=="No",0,NA))] # individual reports stroke at baseline
  w02 <- w02[,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline

# BP measurements
  # based on the communication with the CARRS team, the average of the last two readings were used to inform respondents. 
    # pair baseline
      w02 <- w02[,sys:=ifelse(is.na(systolic_bp_third=="TRUE"),((systolic_bp_first+systolic_bp_second)/2),((systolic_bp_third+systolic_bp_second)/2))]
      w02 <- w02[,dias:=ifelse(is.na(diastolic_bp_third=="TRUE"),((diastolic_bp_first+diastolic_bp_second)/2),((diastolic_bp_third+diastolic_bp_second)/2))]

    # pair follow-up
      w02 <- w02[,sys_f:=ifelse(is.na(sbp_fu2_3=="TRUE"),((sbp_fu2_1+sbp_fu2_2)/2),((sbp_fu2_3+sbp_fu2_2)/2))]
      w02 <- w02[,dias_f:=ifelse(is.na(dbp_fu2_3=="TRUE"),((dbp_fu2_1+dbp_fu2_2)/2),((dbp_fu2_3+dbp_fu2_2)/2))]
    
  # indicator on individual having high systolic or diastolic BP measurements
    # pair baseline
      w02 <- w02[,syshigh0:=ifelse(sys>=140,1,0)]
      w02 <- w02[,diashigh0:=ifelse(dias>=90,1,0)]
      w02 <- w02[,sysdiashigh0:=ifelse(sys>=140 | dias>=90 ,1,0)]
      
    # pair follow-up
      w02 <- w02[,syshigh2:=ifelse(sys_f>=140,1,0)]
      w02 <- w02[,diashigh2:=ifelse(dias_f>=90,1,0)]
      w02 <- w02[,sysdiashigh2:=ifelse(sys_f>=140 | dias_f>=90 ,1,0)]
    
  # difference in BP between pair baseline and follow-up  
    w02 <- w02[,sysdiff:=sys_f-sys]
    w02 <- w02[,diasdiff:=dias_f-dias]

# keeping relevant variables 
  w02 <- w02 %>% dplyr:: select((grep("pd", names(w02))), (grep("sys", names(w02))), (grep("dias", names(w02))), "w02" , "pid" ,"hhp_id","sys","dias", 
                             "syshigh2","diashigh2","sysdiashigh2","sys_f","dias_f","syshigh0","diashigh0","sysdiashigh0",
                             "female","edu","religion","agecat","age_calc",
                             "diabetesknow" , "heartknow" , "strokeknow","anyknow","entryd","entryd_fup1","city",
                             "mh_hbp","hbp_dis", "hbp_allopathic", "hbp_trt_all")


################################################################################
# Cleaning Pair 02
################################################################################
# Step 2 - Keep people 30 years and older at pair baseline
  base <- base[, age_calc:= trunc((pd_dob %--% entryd_fup2) / years(1))]
  samplesize[2,3] <-nrow(base[is.na(year_2)=="FALSE" & age_calc>=30,])

# Step 3 - Keep people who have paired observations, aged 30+ with non-missing sex
  w24 <-base[w24==1,] 
  w24 <- w24[, age_calc:= trunc((pd_dob %--% entryd_fup2) / years(1))]
  summary(w24$age_calc)
  w24 <- w24[age_calc>=30,] 
  w24 <-na.omit(w24, cols=(c("female"))) 
  samplesize[3,3] <- nrow(w24)

# Step 4.1 - Drop respondents with missing BP measurements at pair baseline
  w24 <- w24[is.na(sbp_fu2_2)=="FALSE" & is.na(sbp_fu2_1)=="FALSE",]
  w24 <- w24[is.na(dbp_fu2_2)=="FALSE" & is.na(dbp_fu2_1)=="FALSE",]

# Step 4.2 - Drop respondents with missing BP measurements at pair follow-up
  w24 <- w24[is.na(bp_s2)=="FALSE" & is.na(bp_s1)=="FALSE",]
  w24 <- w24[is.na(bp_d2)=="FALSE" & is.na(bp_d1)=="FALSE",]
  samplesize[4,3] <- nrow(w24)

# age categories (<40 and 40+) 
  w24 <- w24[, agecat:= ifelse(age_calc<40,0, ifelse(age_calc>=40,1,NA))]

# level of education 
  # The labels are wrong. Primary school is actually "uncompleted primary school", 
  # High school refers to "secondary school completed". Secondary School / intermediary refers to High School. 
  # This can be verified by looking at the years of schooling in each of the categories.
  w24 <- w24[, edu:= ifelse(pd_edu_stat=="Illiterate" | pd_edu_stat=="Literate" | pd_edu_stat=="Primary school", 0,
                            ifelse(pd_edu_stat== "High school",1,
                                   ifelse(pd_edu_stat=="Secondary school/ Intermediary" | pd_edu_stat=="Graduate" | pd_edu_stat=="Professional degree/post graduate", 2, NA)))]
  table(w24$edu)
  
# prior diagnosis of cardiometabolic disease
  w24 <- w24[,diabetesknow:=ifelse(mh_diab=="Yes" | pd_diabetes=="Yes" ,1,ifelse(mh_diab=="No",0,NA))] # individual reports diabetes diagnosis at baseline
  w24 <- w24[,heartknow:=ifelse(mh_heart=="Yes" | pd_heart=="Yes",1,ifelse(mh_heart=="No",0,NA))] # individual reports heart attack at baseline
  w24 <- w24[,strokeknow:=ifelse(mh_stroke=="Yes" | pd_heart=="Yes",1,ifelse(mh_stroke=="No",0,NA))] # individual reports stroke at baseline
  w24 <- w24[,anyknow:=ifelse((diabetesknow==1 | heartknow==1 | strokeknow==1),1,ifelse((diabetesknow==0 & heartknow==0 & strokeknow==0),0,NA))] # individual reports any of the three CMDs at baseline
  
# BP measurements
  # based on the communication with the CARRS team, the average of the last two readings were used to inform respondents. 
  # pair baseline
    w24 <- w24[,sys:=ifelse(is.na(sbp_fu2_3=="TRUE"),((sbp_fu2_1+sbp_fu2_2)/2),((sbp_fu2_3+sbp_fu2_2)/2))]
    w24 <- w24[,dias:=ifelse(is.na(dbp_fu2_3=="TRUE"),((dbp_fu2_1+dbp_fu2_2)/2),((dbp_fu2_3+dbp_fu2_2)/2))]

  # pair follow-up
    w24 <- w24[is.na(bp_s3)=="TRUE",sys_f:=(bp_s1+bp_s2)/2]
    w24 <- w24[is.na(bp_d3)=="TRUE",dias_f:=(bp_d1+bp_d2)/2]
    w24 <- w24[is.na(sys_f)=="TRUE" & is.na(bp_s3)=="FALSE",sys_f:=((bp_s3+bp_s2)/2)]
    w24 <- w24[is.na(dias_f)=="TRUE" & is.na(bp_d3)=="FALSE",dias_f:=((bp_d3+bp_d2)/2)]
    
  # indicator on individual having high systolic or diastolic BP measurements
    # pair baseline
      w24 <- w24[,syshigh2:=ifelse(sys>=140,1,0)]
      w24 <- w24[,diashigh2:=ifelse(dias>=90,1,0)]
      w24 <- w24[,sysdiashigh2:=ifelse(sys>=140 | dias>=90 ,1,0)]

    # pair follow-up
      w24 <- w24[,syshigh4:=ifelse(sys_f>=140,1,0)]
      w24 <- w24[,diashigh4:=ifelse(dias_f>=90,1,0)]
      w24 <- w24[,sysdiashigh4:=ifelse(sys_f>=140 | dias_f>=90 ,1,0)]
      summary(w24$syshigh4)
      
    # difference in BP between pair baseline and follow-up  
      w24 <- w24[,sysdiff:=sys_f-sys]
      w24 <- w24[,diasdiff:=dias_f-dias]

# keeping relevant variables 
  w24 <- w24 %>% dplyr:: select((grep("pd",  names(w24))),
                               (grep("sys", names(w24))), (grep("dias", names(w24))),
                               "pid" ,"hhp_id","w24","sys","dias",
                               "female","edu","religion","agecat","age_calc",
                               "syshigh2","diashigh2","sysdiashigh2","sys_f","dias_f","syshigh4","diashigh4","sysdiashigh4",
                               "fo_hypert", "diabetesknow" , "heartknow" , "strokeknow","anyknow","entryd_fup3","entryd_fup2","mh_hbp","city",
                               "hbp_dis", "hbp_allopathic", "hbp_trt_all")

###########################################################################################
# Creating the analysis datasets
###########################################################################################
  
#identifying the different rounds
  cols = c("w02", "w24")
  carrs<- rbindlist(list(w02,w24),fill = TRUE)
  carrs[ , (cols) := lapply(.SD, nafill, fill=0), .SDcols = cols]  

# defining the following columns as factor for the summary characteristic tables
  cols <- c("diabetesknow","heartknow", "strokeknow", "religion")
  setDT(carrs )[, (cols):= lapply(.SD, factor), .SDcols=cols]

#creating a factor version of the education and age category variables and labeling them for the summary characteristic tables
  carrs$eduf<-factor(carrs$edu,labels =c("Primary","Secondary","Tertiary"))
  carrs$agecatf<-factor(carrs$agecat,labels=c("$<$40 years","$\\geq$40 years"))

###########################################################################################
# Running variables for diagnosis outcome
###########################################################################################

# main analysis
  carrs<-carrs[,runningvar:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==1,runningvar_female:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs<-carrs[female==0,runningvar_male:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                           ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]

# by education group
  carrs<-carrs[edu == 0,runningvar_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                          ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[edu == 1,runningvar_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                          ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[edu == 2,runningvar_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                          ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs<-carrs[female==1 & edu == 0,runningvar_female_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==1 & edu == 1,runningvar_female_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==1 & edu == 2,runningvar_female_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  
  carrs<-carrs[female==0 & edu == 0,runningvar_male_edu0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                           ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==0 & edu == 1,runningvar_male_edu1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                           ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==0 & edu == 2,runningvar_male_edu2:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                           ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]

# by age group
  carrs<-carrs[agecat == 0,runningvar_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                           ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]

  carrs<-carrs[agecat == 1,runningvar_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                             ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==1 & agecat == 0,runningvar_female_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==1 & agecat == 1,runningvar_female_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                                ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==0 & agecat == 0,runningvar_male_age0:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                              ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]
  
  carrs<-carrs[female==0 & agecat == 1,runningvar_male_age1:=ifelse((((sys-140)/(sd(sys))) == ((dias-90)/(sd(dias)))),(sys-140)/(sd(sys)),
                                                                              ifelse((((sys-140)/(sd(sys))) > ((dias-90)/(sd(dias)))),((sys-140)/(sd(sys))),((dias-90)/(sd(dias)))))]



###########################################################################################
# Summaries of dataset
###########################################################################################

datasummary(w02 + w24 ~ N , data=carrs)

samplesize<-as.data.table(samplesize)
samplesize[, 2:ncol(samplesize)] <- lapply(2:ncol(samplesize), 
                                           function(x) as.numeric(samplesize[[x]]))
samplesize<-samplesize %>% mutate(All=PairA+PairB)

###########################################################################################
# Saving the datasets
###########################################################################################

write.csv(carrs, "carrs_bp.csv", row.names=FALSE)
save(carrs, file = "carrs_bp.Rdata")

write.csv(samplesize, "samplesize_bp.csv", row.names=FALSE)
save(samplesize, file = "samplesize_bp.Rdata")